In [1]:
import pandas as pd
import sklearn.neural_network as sklnn
import sklearn.tree as skltr

In [2]:
shots = pd.read_csv('./shot_logs.csv')

In [3]:
# Number of rows containing at least one null value
shots.isnull().any(axis=1).sum()


Out[3]:
5567
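
Before filling anything in, it is worth checking which columns actually hold the nulls; in this file they are expected to sit mostly in SHOT_CLOCK, which is empty when the shot clock was turned off late in a period. A quick per-column check (a sketch, nothing is modified here):

In [ ]:
# Count the null values per column to see where the gaps live
shots.isnull().sum()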

In [4]:
# Let's get rid of the null values by filling them with 0
shots.fillna(0, inplace=True)
shots.head()


Out[4]:
GAME_ID MATCHUP LOCATION W FINAL_MARGIN SHOT_NUMBER PERIOD GAME_CLOCK SHOT_CLOCK DRIBBLES ... SHOT_DIST PTS_TYPE SHOT_RESULT CLOSEST_DEFENDER CLOSEST_DEFENDER_PLAYER_ID CLOSE_DEF_DIST FGM PTS player_name player_id
0 21400899 MAR 04, 2015 - CHA @ BKN A W 24 1 1 1:09 10.8 2 ... 7.7 2 made Anderson, Alan 101187 1.3 1 2 brian roberts 203148
1 21400899 MAR 04, 2015 - CHA @ BKN A W 24 2 1 0:14 3.4 0 ... 28.2 3 missed Bogdanovic, Bojan 202711 6.1 0 0 brian roberts 203148
2 21400899 MAR 04, 2015 - CHA @ BKN A W 24 3 1 0:00 0.0 3 ... 10.1 2 missed Bogdanovic, Bojan 202711 0.9 0 0 brian roberts 203148
3 21400899 MAR 04, 2015 - CHA @ BKN A W 24 4 2 11:47 10.3 2 ... 17.2 2 missed Brown, Markel 203900 3.4 0 0 brian roberts 203148
4 21400899 MAR 04, 2015 - CHA @ BKN A W 24 5 2 10:34 10.9 2 ... 3.7 2 missed Young, Thaddeus 201152 1.1 0 0 brian roberts 203148

5 rows × 21 columns


In [5]:
# Drop identifier columns and columns that leak the outcome (FGM, PTS, W, FINAL_MARGIN)
treated = shots.drop(['GAME_ID', 'MATCHUP', 'LOCATION', 'W', 'FINAL_MARGIN', 'FGM', 'PTS', 'CLOSEST_DEFENDER', 'CLOSEST_DEFENDER_PLAYER_ID', 'player_name', 'player_id'], axis=1)
treated.head()
treated.head()


Out[5]:
SHOT_NUMBER PERIOD GAME_CLOCK SHOT_CLOCK DRIBBLES TOUCH_TIME SHOT_DIST PTS_TYPE SHOT_RESULT CLOSE_DEF_DIST
0 1 1 1:09 10.8 2 1.9 7.7 2 made 1.3
1 2 1 0:14 3.4 0 0.8 28.2 3 missed 6.1
2 3 1 0:00 0.0 3 2.7 10.1 2 missed 0.9
3 4 2 11:47 10.3 2 1.9 17.2 2 missed 3.4
4 5 2 10:34 10.9 2 2.7 3.7 2 missed 1.1

In [6]:
# GAME_CLOCK is a "minutes:seconds" string; convert it to seconds
def clock_to_seconds(clock):
    mins, secs = clock.split(':')
    return int(mins) * 60 + int(secs)
treated['GAME_CLOCK'] = treated['GAME_CLOCK'].apply(clock_to_seconds)
treated.head()


Out[6]:
SHOT_NUMBER PERIOD GAME_CLOCK SHOT_CLOCK DRIBBLES TOUCH_TIME SHOT_DIST PTS_TYPE SHOT_RESULT CLOSE_DEF_DIST
0 1 1 69 10.8 2 1.9 7.7 2 made 1.3
1 2 1 14 3.4 0 0.8 28.2 3 missed 6.1
2 3 1 0 0.0 3 2.7 10.1 2 missed 0.9
3 4 2 707 10.3 2 1.9 17.2 2 missed 3.4
4 5 2 634 10.9 2 2.7 3.7 2 missed 1.1
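
The row-wise apply above is fine at this scale; for larger frames a vectorized equivalent (a sketch producing the same values) splits the string column once:

In [ ]:
# Vectorized alternative: split "M:SS" into two integer columns,
# then combine them into seconds (same result as the apply above)
mm_ss = shots['GAME_CLOCK'].str.split(':', expand=True).astype(int)
seconds = mm_ss[0] * 60 + mm_ss[1]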

In [7]:
# Shuffle the data and split it 70/30 into training and test sets,
# then encode the label for each sample: made = 0, missed = 1
train = treated.sample(frac=0.7)
test = treated.drop(train.index)
train_result = train['SHOT_RESULT'].map({'made': 0, 'missed': 1})
train.drop('SHOT_RESULT', axis=1, inplace=True)
test_result = test['SHOT_RESULT'].map({'made': 0, 'missed': 1})
test.drop('SHOT_RESULT', axis=1, inplace=True)
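
The manual sample/drop split works; scikit-learn's helper does the same job and can additionally stratify on the label so train and test keep the same made/missed ratio (a sketch, not used below):

In [ ]:
from sklearn.model_selection import train_test_split
# Equivalent 70/30 split; stratify=y preserves the class balance in both parts
X = treated.drop('SHOT_RESULT', axis=1)
y = treated['SHOT_RESULT'].map({'made': 0, 'missed': 1})
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, stratify=y)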

In [13]:
# Two hidden layers (15 and 10 units); warm_start keeps the learned weights,
# so re-running this cell continues training instead of starting over
classifier = sklnn.MLPClassifier(activation='relu', hidden_layer_sizes=(15, 10), max_iter=5000, warm_start=True)
classifier.fit(train, train_result)
prediction = classifier.predict(test)
print("Accuracy using a (15, 10) MLP: %.2f" % ((prediction == test_result).sum() / len(prediction)))


Accuracy using a (15, 10) MLP: 0.62
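
MLPs are sensitive to feature scale, and these columns live on very different ranges (GAME_CLOCK in hundreds of seconds, CLOSE_DEF_DIST in feet), so standardizing the inputs may help; here is a sketch using a scikit-learn Pipeline (whether it actually moves the accuracy would have to be measured):

In [ ]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardize every feature to zero mean / unit variance before the MLP
scaled_mlp = make_pipeline(
    StandardScaler(),
    sklnn.MLPClassifier(activation='relu', hidden_layer_sizes=(15, 10), max_iter=5000))
scaled_mlp.fit(train, train_result)
print("Accuracy with scaling: %.2f" % scaled_mlp.score(test, test_result))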

In [22]:
# A single decision tree on the same split, considering at most 7 of the
# 9 features at each split
tree = skltr.DecisionTreeClassifier(criterion='entropy', max_features=7)
tree.fit(train, train_result)
prediction = tree.predict(test)
print("Accuracy using a decision tree: %.2f" % ((prediction == test_result).sum() / len(prediction)))


Accuracy using a decision tree: 0.54
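
A fully grown tree tends to memorize noisy per-shot data, which is why it lands at or below the baseline; capping its depth regularizes it (a sketch, max_depth=5 is an arbitrary illustrative value that would normally be tuned on a validation set):

In [ ]:
# Limit tree depth to reduce overfitting on the training set
shallow_tree = skltr.DecisionTreeClassifier(criterion='entropy', max_depth=5)
shallow_tree.fit(train, train_result)
print("Accuracy with max_depth=5: %.2f" % shallow_tree.score(test, test_result))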

In [21]:
# Baseline: always predict the majority class ('missed'),
# whose share of the test set is test_result.mean()
print("Majority-class baseline (always guess 'missed'): %.2f" % test_result.mean())


Majority-class baseline (always guess 'missed'): 0.55

We can see that a decision tree classifier using entropy as the split criterion does no better than the majority-class baseline, while the MLP classifier improves prediction accuracy only slightly.
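
Both numbers also come from a single random split, so part of the difference could be noise; k-fold cross-validation gives a steadier estimate (a sketch, reusing the illustrative max_depth=5 tree):

In [ ]:
from sklearn.model_selection import cross_val_score
# Average accuracy over 5 different splits instead of one
X = treated.drop('SHOT_RESULT', axis=1)
y = treated['SHOT_RESULT'].map({'made': 0, 'missed': 1})
scores = cross_val_score(skltr.DecisionTreeClassifier(criterion='entropy', max_depth=5), X, y, cv=5)
print("Decision tree 5-fold CV accuracy: %.2f" % scores.mean())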


In [ ]: